clear all %#ok<CLALL>

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%process CRSP data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%CRSP-files were downloaded from WRDS on February 11, 2025
%Company Code: PERMNO. 
%Restricted to NYSE, AMEX and NASDAQ stocks
%common ordinary shares 
%shrcd BETWEEN 9.5 AND 11.5 AND exchcd BETWEEN 0.5 AND 3.5 

%%%%%%%%%%%
%load monthly CRSP file
%Variables: CUSIP SHRCD EXCHCD RET DLSTCD DLRET
CRSP = readtable('CRSPmonthly.csv','Delimiter',',','Format','%f%f%f%f%s%f%f%f','HeaderLines',0,'ReadVariableNames', true, 'TreatAsEmpty',{'C','A','S','T','P'});
CRSP = CRSP(:,["PERMNO","date","CUSIP","RET","DLSTCD","DLRET"]);

%sort by permno and date
CRSP=sortrows(CRSP,["PERMNO","date"]);

%%%adjusting stock returns for delisting following Shumway (1997)
%If there is a delisting code in CRSP, use of delisting return if available
%If there is no delisting return available, assume a delisting return
%of -0.3 if delisting code is in 'delistingcodes'.
%Otherwise, assume a delisting return of -1.
delistingcodes = [500, 520, 551:574, 580, 584];
CRSP{CRSP{:,"DLSTCD"}==100,"DLSTCD"} = NaN; %stock was still active
CRSP{~isnan(CRSP{:,"DLSTCD"}) & ~ismember(CRSP{:,"DLSTCD"},delistingcodes),"RET"} = -1;
CRSP{~isnan(CRSP{:,"DLSTCD"}) &  ismember(CRSP{:,"DLSTCD"},delistingcodes),"RET"} = -0.3;
CRSP{~isnan(CRSP{:,"DLSTCD"}) & ~isnan(CRSP{:,"DLRET"}),"RET"}                    = CRSP{~isnan(CRSP{:,"DLSTCD"}) & ~isnan(CRSP{:,"DLRET"}),"DLRET"};
clear delistingcodes
%%%

CRSP{:,"month"} = floor(CRSP{:,"date"}/100);

%construct data set and use all monthly CRSP observations as formation
%month that have a monthly return and were not delisted in that month
datasettemp    = CRSP(~isnan(CRSP{:,"RET"}) & isnan(CRSP{:,"DLSTCD"}),["PERMNO","CUSIP","date","month"]);
dataset.PERMNO = table2array(datasettemp(:,"PERMNO"));
dataset.CUSIP  = datasettemp(:,"CUSIP");
dataset.date   = table2array(datasettemp(:,"date"));
dataset.month  = table2array(datasettemp(:,"month"));
clear datasettemp;

monthun  = unique(dataset.month);
permnoun = unique(dataset.PERMNO);
CRSP     = CRSP(ismember(CRSP{:,"month"},monthun) & ismember(CRSP{:,"PERMNO"},permnoun),:);

%Create return wide matrix from long matrix
retmmonth = table2array(unstack(CRSP(:,["PERMNO","date","RET"]),'RET','PERMNO'));
retmmonth = sortrows(retmmonth,1);
retmmonth = retmmonth(:,2:end);



%%%%%%%%%%%
%load daily CRSP file
%Variables: CUSIP SHRCD EXCHCD PRC VOL OPENPRC BID ASK RET SHROUT DLSTCD DLRET
CRSP = readtable('CRSP.csv','Delimiter',',','Format','%f%f%f%f%s%f%f%f%f%f%f%f%f%f','HeaderLines',0,'ReadVariableNames', true, 'TreatAsEmpty',{'C','A','S','T','P'});
CRSP = CRSP(:,["PERMNO","date","CUSIP","PRC","VOL","RET","BID","ASK","SHROUT","OPENPRC","EXCHCD","DLSTCD","DLRET"]);

%sort by permno and date
CRSP=sortrows(CRSP,["PERMNO","date"]);

%restrict to permnos that show up in monthly file
CRSP = CRSP(ismember(CRSP{:,"PERMNO"},permnoun),:);

%unique trading days
datesun = unique(CRSP{:,"date"});

CRSP{:,"PRC"}=abs(CRSP{:,"PRC"}); %negative prices indicate bid-ask-midpoints

%Create wide matrices from long matrix
retm = table2array(unstack(CRSP(:,["PERMNO","date","RET"]),'RET','PERMNO'));
retm = sortrows(retm,1);
retm = retm(:,2:end);
bads = (sum(~isnan(retm),2)==0); %delete days without any return data
retm = retm(~bads,:);

exccm = table2array(unstack(CRSP(:,["PERMNO","date","EXCHCD"]),'EXCHCD','PERMNO'));
exccm = sortrows(exccm,1);
exccm = exccm(~bads,2:end);

shoutm = table2array(unstack(CRSP(:,["PERMNO","date","SHROUT"]),'SHROUT','PERMNO'));
shoutm = sortrows(shoutm,1);
shoutm = shoutm(~bads,2:end);
shoutm(shoutm==0) = NaN;

volm = table2array(unstack(CRSP(:,["PERMNO","date","VOL"]),'VOL','PERMNO'));
volm = sortrows(volm,1);
volm = volm(~bads,2:end);

prcm = table2array(unstack(CRSP(:,["PERMNO","date","PRC"]),'PRC','PERMNO'));
prcm = sortrows(prcm,1);
prcm = prcm(~bads,2:end);

datesun = datesun(~bads,:);

mrkcm  = prcm.*shoutm;          %market capitalization
trvm   = prcm.*volm;            %dollar trading volume
tom    = trvm./mrkcm*0.001;     %turnover, since market capitalization is in 1000


%load stock split data from daily CRSP database
%downloaded from WRDS on May 1, 2025
%shrcd BETWEEN 9.5 AND 11.5 AND exchcd BETWEEN 0.5 AND 3.5 AND facpr IS NOT NULL AND facpr != -1 AND facpr != 0
%Variables: PERMNO,date,shrcd,exchcd,facpr
table_stsplit = readtable('StockSplitData.csv','Delimiter',',','Format','%f%f%f%f%f','HeaderLines',0,'ReadVariableNames', true);
table_stsplit=sortrows(table_stsplit,["PERMNO","date"]);

matlabdates = datenum(num2str(dataset.date),'yyyymmdd'); %#ok<DATNM>
table_stsplit{:,"mdate"} = datenum(num2str(table_stsplit{:,"date"}),'yyyymmdd'); %#ok<DATNM>

%create matrix of split-adjusted prices
splitadjprcm = prcm;
for j = 1:size(table_stsplit,1)
    column = sum(permnoun<table_stsplit{j,"PERMNO"})+1;
    row    = sum(datesun<table_stsplit{j,"date"})+1;
    splitadjprcm(row:end,column) = splitadjprcm(row:end,column) * (table_stsplit{j,"FACPR"}+1);
end



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%CONTRUCT DAILY FACTOR MATRIX
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

dFact = array2table(datesun);

%load daily factor data from Kenneth French's Homepage
dFact=outerjoin(dFact,readtable('F-F_Research_Data_Factors_daily.csv','Delimiter',',','Format','%f%f%f%f%f','ReadVariableNames', true,'HeaderLines', 0),'Keys',[1 1],'MergeKeys',true);

%do not keep factors as percentage numbers
dFact{:,2:end} = dFact{:,2:end}*0.01;

dFact.Properties.VariableNames(1) = "date";


%%%%%%%%%%
%Add return-based variables to monthly dataset

dayspermonth = [monthun,NaN(length(monthun),2)];
for j=1:length(dayspermonth)
    dayspermonth(j,2) = max(datesun(dayspermonth(j,1)==floor(datesun/100)));
    dayspermonth(j,3) = sum(dayspermonth(j,1)==floor(datesun/100));
end

[~, matching1] = ismember(dataset.PERMNO,permnoun);    %column in wide-matrices to select correct firm
[~, matching2] = ismember(dataset.date,datesun);       %row in daily wide-matrices
[~, matching3] = ismember(dataset.month,monthun);      %row in monthly wide-matrices

vars = ["MARKETCAP","SIZE_DEC","BETA","IVOL","VOL","TO","CEI",...
        "RETm1mon","RETm2mon","RETm3mon","RETm1yr","RETm2yr","RETm3yr","RETm4yr","RETm5yr"];

for j=1:length(vars)
     dataset.(vars{j}) = NaN(size(dataset.PERMNO,1),1);
end

dataset.DAILYRET = NaN(size(dataset.PERMNO,1),28);

for j=1:size(dataset.PERMNO,1)
   if matching3(j,1)>=60 
     k1 = matching1(j,1);      %column in wide-matrices selecting correct firm
     k2 = matching2(j,1);      %row in daily wide-matrices 
     k3 = matching3(j,1);      %row in monthly wide-matrices

     %market capitalization
     dataset.MARKETCAP(j,1) = mrkcm(k2,k1);

     %log market cap at the end of December to calculate BM ratio
     dataset.SIZE_DEC(j,1) = log(mrkcm(find(rem(floor(datesun(1:(k2-sum(dayspermonth(k3-5:k3,3))),1)/100),100)==12,1,'last'),k1)*1000); 
     
     %market beta
     if sum(~isnan(retm(k2-sum(dayspermonth(k3-11:k3,3))+1:k2,k1)))>=200
        covmat=cov(retm(k2-sum(dayspermonth(k3-11:k3,3))+1:k2,k1)-dFact{k2-sum(dayspermonth(k3-11:k3,3))+1:k2,"RF"},dFact{k2-sum(dayspermonth(k3-11:k3,3))+1:k2,"Mkt_RF"},'omitrows');
        dataset.BETA(j,1) = covmat(1,2)/covmat(2,2); 
     end
     
     if sum(~isnan(retm((k2-dayspermonth(k3,3)+1):k2,k1)))>=15
         %idiosyncratic return volatility as in Ang et al. (2006)
         x = [ones(dayspermonth(k3,3),1), dFact{(k2-dayspermonth(k3,3)+1):k2,["Mkt_RF","SMB","HML"]}];
         y = retm((k2-dayspermonth(k3,3)+1):k2,k1) - dFact{(k2-dayspermonth(k3,3)+1):k2,"RF"};
         x = x(~isnan(y),:);
         y = y(~isnan(y),:);
         b = x\y;
         res = y - x * b; 
         dataset.IVOL(j,1) = (mean(res.^2))^0.5*252^0.5; 
         
         %standard deviation of excess returns in previous month
         dataset.VOL(j,1) = std(y)*252^0.5; 
     end
    
     %returns of the previous 5 years
     dataset.RETm1yr(j,1) = prod(retmmonth((k3-11):(k3- 0),k1)+1)-1;
     dataset.RETm2yr(j,1) = prod(retmmonth((k3-23):(k3-12),k1)+1)-1;
     dataset.RETm3yr(j,1) = prod(retmmonth((k3-35):(k3-24),k1)+1)-1;
     dataset.RETm4yr(j,1) = prod(retmmonth((k3-47):(k3-36),k1)+1)-1;
     dataset.RETm5yr(j,1) = prod(retmmonth((k3-59):(k3-48),k1)+1)-1;

     %turnover last month
     dataset.TO(j,1) = sum(tom(k2-dayspermonth(k3,3)+1:k2,k1));
     
     %composite equity issuance over previous year (cf Daniel/Titman, 2006)
     %change in market cap unexplained by stock return
     dataset.CEI(j,1) = mrkcm(k2,k1)/mrkcm(k2-sum(dayspermonth(k3-11:k3,3)),k1) - prod(retmmonth(k3-11:k3,k1)+1);

     %monthly stock returns over previous year 
     dataset.RETm1mon(j,1)  = retmmonth(k3,k1);
     dataset.RETm2mon(j,1)  = retmmonth(k3-1,k1);
     dataset.RETm3mon(j,1)  = retmmonth(k3-2,k1);

   end 
end


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%process short interest data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% COMPUSTAT-files were downloaded from WRDS on May 2, 2025
% download from Compustat - Capital IQ - CompustatNorth America - Daily - Compustat Daily Updates - Supplemental Short Interest File
% Company Code: cusip
% search entire database

table_shortint = readtable('ShortInterestfromCOMPUSTAT.csv','Delimiter',',','Format','%s%s%s%s%f%f');
table_shortint = table_shortint(~strcmp(table2cell(table_shortint(:,"cusip")),''),:);
temp = cell2mat(table_shortint{:,"cusip"});
table_shortint(:,"cusip") = cell2table(cellstr(temp(:,1:8))); %transform 9-digit cusip to 8 digit

table_shortint{:,"date"} = yyyymmdd(datetime(table_shortint{:,"datadate"},'InputFormat','yyyy-MM-dd'));
table_shortint{:,"month"} = floor(table_shortint{:,"date"}/100);
table_shortint = table_shortint(:,["cusip","month","shortint","date"]);
table_shortint = sortrows(table_shortint,["cusip","month"]);

% delete cusip-month-duplicates
bads = [0 ;table_shortint{2:end,"month"}==table_shortint{1:end-1,"month"} & strcmp(table2cell(table_shortint(2:end,"cusip")),table2cell(table_shortint(1:end-1,"cusip")))];
table_shortint=table_shortint(~bads,:);

%if date is no CRSP trading day, use previous day instead
temp = table2array(unique(table_shortint(:,"date")));
temp(isnan(temp)) = [];
temp = [temp, NaN(size(temp))];
for j = 1:size(temp,1)
    temp(j,2) = max(datesun(datesun<=temp(j,1)));
end
table_shortint = outerjoin(table_shortint, array2table(temp),'LeftKeys',"date",'RightKeys',"temp1",'Type','left','LeftVariables',1:4,'RightVariables',2);

%merge with number of shares outstanding from CRSP
table_shortint = outerjoin(table_shortint, CRSP(:,["CUSIP","date","SHROUT"]),'LeftKeys',["cusip","temp2"],'RightKeys',["CUSIP","date"],'Type','left','LeftVariables',1:5,'RightVariables',3);
table_shortint{:,"SHORTINT"} = table_shortint{:,"shortint"} ./ table_shortint{:,"SHROUT"} * 0.001; %calculate relative short interest
dataset.SHORTINT = outerjoin([dataset.CUSIP,array2table([dataset.PERMNO, dataset.month])], table_shortint,'LeftKeys',["CUSIP","Var2"],'RightKeys',["cusip","month"],'Type','left','LeftVariables',[2 3],'RightVariables',"SHORTINT");
dataset.SHORTINT = sortrows(dataset.SHORTINT,1:2);
dataset.SHORTINT = dataset.SHORTINT{:,3}; %relative short interest


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%process accounting data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% COMPUSTAT-files were downloaded from WRDS on February 6, 2025
% download from CRSP-COMPUSTAT-MERGED
% IDENTIFIER: LPERMNO
% LINKTYPES = LC or LU or LS
% no share code restrictions

table_COMP = readtable('COMPUSTAT.csv','Delimiter',',','Format','%f%f%s%f%f%s%s%s%s%s%s%s%s%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%f%c%f%f');
table_COMP = sortrows(table_COMP,["LPERMNO", "datadate"]);

%Calculate Book Value of Equity
table_COMP{:,"BVPS"} = zeros(size(table_COMP,1),1);%calculate book value of preferred stock
table_COMP(~isnan(table_COMP{:,"pstk"}),"BVPS")   = table_COMP(~isnan(table_COMP{:,"pstk"}),"pstk");
table_COMP(~isnan(table_COMP{:,"pstkl"}),"BVPS")  = table_COMP(~isnan(table_COMP{:,"pstkl"}),"pstkl");
table_COMP(~isnan(table_COMP{:,"pstkrv"}),"BVPS") = table_COMP(~isnan(table_COMP{:,"pstkrv"}),"pstkrv");
table_COMP{isnan(table_COMP{:,"txditc"}),"txditc"}= 0; %set deferred taxes to zero if NAN
table_COMP{:,"BVE"} = table_COMP{:,"seq"} + table_COMP{:,"txditc"} - table_COMP{:,"BVPS"} ;%book value of equity
table_COMP{isnan(table_COMP{:,"BVE"}),"BVE"} = table_COMP{isnan(table_COMP{:,"BVE"}),"ceq"} + table_COMP{isnan(table_COMP{:,"BVE"}),"txditc"} ;%use ceq if seq is not available
table_COMP{isnan(table_COMP{:,"BVE"}),"BVE"} = table_COMP{isnan(table_COMP{:,"BVE"}),"at"} - table_COMP{isnan(table_COMP{:,"BVE"}),"lt"} ;%use difference between assets and liabilities if ceq is also not available

%Operating Profits as in Fama/French (2015)
bads = (isnan(table_COMP{:,"cogs"}) & isnan(table_COMP{:,"xsga"}) & isnan(table_COMP{:,"xint"}));  %require at least one of the cost items
table_COMP{isnan(table_COMP{:,"cogs"}),"cogs"}  = 0;
table_COMP{isnan(table_COMP{:,"xsga"}),"xsga"}  = 0;
table_COMP{isnan(table_COMP{:,"xint"}),"xint"}  = 0;
table_COMP{:,"OP"} = (table_COMP{:,"revt"} - table_COMP{:,"cogs"} - table_COMP{:,"xsga"} - table_COMP{:,"xint"}) ./ table_COMP{:,"BVE"} ;
table_COMP{bads,"OP"} = NaN;
table_COMP{table_COMP{:,"BVE"}<=0,"OP"} = NaN; %keep OP-estimate only in the case of positive BVE

%merge to dataset struct
dataset.accountingyear = floor((dataset.date-600)/10000)-1; %relevant accounting year
table_COMP{:,"datadate"} = floor(table_COMP{:,"datadate"}/10000);
[~,temp,~] = unique(table_COMP(:,["LPERMNO", "datadate"]),'last');
table_COMP = table_COMP(temp,:);
dataset.OP   = table2array(outerjoin(array2table([dataset.PERMNO,dataset.accountingyear,dataset.date]), table_COMP(:,["LPERMNO", "datadate", "OP"  ]),'LeftKeys',["Var1","Var2"],'RightKeys',["LPERMNO","datadate"],'Type','left','LeftVariables',[],'RightVariables',3));
dataset.BM   = table2array(outerjoin(array2table([dataset.PERMNO,dataset.accountingyear,dataset.date]), table_COMP(:,["LPERMNO", "datadate", "BVE" ]),'LeftKeys',["Var1","Var2"],'RightKeys',["LPERMNO","datadate"],'Type','left','LeftVariables',[],'RightVariables',3))*1000000 ./ exp(dataset.SIZE_DEC(:,1));
dataset.EP   = table2array(outerjoin(array2table([dataset.PERMNO,dataset.accountingyear,dataset.date]), table_COMP(:,["LPERMNO", "datadate", "ib"  ]),'LeftKeys',["Var1","Var2"],'RightKeys',["LPERMNO","datadate"],'Type','left','LeftVariables',[],'RightVariables',3))*1000000 ./ exp(dataset.SIZE_DEC(:,1));


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%add analyst dispersion
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% IBES-file (unadjusted summary file on EPS forecasts)
% downloaded from WRDS on May 2, 2025
% cusip identifier, global IBES universe, EPS, 1 year fiscal period indic
table_IBESsummary = readtable('IBES_unadjustedSummary.csv','Delimiter',',','Format','%s%s%s%s%f%s%s%s%s%s%f%f%f%f%f%f%f%f%s','CommentStyle',{'"', '"'});
table_IBESsummary = table_IBESsummary(strcmp(table_IBESsummary{:,"FISCALP"},'ANN'),:); %only annual EPS forecasts
table_IBESsummary = table_IBESsummary(strcmp(table_IBESsummary{:,"FPI"},'1'),:); %EPS forecast for next year
table_IBESsummary = table_IBESsummary(~isnan(table_IBESsummary{:,"STDEV"}),:);
table_IBESsummary{:,"STATPERS"} = floor(table_IBESsummary{:,"STATPERS"}/100);

% cusips in IBES files are historical cusips (i.e., ncusips)
table_IBESsummary.Properties.VariableNames("CUSIP") = "NCUSIP";

% We download a monthly CRSP file with cusips and ncusips and permnos
% to replace all IBES ncusips by permnos
mergedata  = readtable('ncsuip-permno-cusip-merge.csv','Delimiter',',','Format','%f%f%s%s'); %permno, date, NCUSIP, CUSIP
mergedata  = mergedata(~strcmp(mergedata{:,"NCUSIP"},''),:);
mergedata  = unique(mergedata(:,["NCUSIP","PERMNO"]),'rows');
mergedata  = mergedata(ismember(mergedata{:,"NCUSIP"},table_IBESsummary{:,"NCUSIP"}),:);
table_IBESsummary = outerjoin(table_IBESsummary, mergedata(:,["NCUSIP", "PERMNO" ]),'LeftKeys',"NCUSIP",'RightKeys',"NCUSIP",'Type','left');
table_IBESsummary = sortrows(table_IBESsummary,["PERMNO", "STATPERS"]);

table_IBESsummary{table_IBESsummary{:,"MEANEST"}==0,"MEANEST"}=NaN;
table_IBESsummary{:,"DISP"} = table_IBESsummary{:,"STDEV"} ./ abs(table_IBESsummary{:,"MEANEST"});
[~,temp,~] = unique(table_IBESsummary(:,["PERMNO", "STATPERS"]),'last');
table_IBESsummary = table_IBESsummary(temp,:);
dataset.DISP     = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), table_IBESsummary(:,["PERMNO", "STATPERS", "DISP"  ]),'LeftKeys',["Var1","Var2"],'RightKeys',["PERMNO","STATPERS"],'Type','left','LeftVariables',[],'RightVariables',3));
dataset.nANALYST = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), table_IBESsummary(:,["PERMNO", "STATPERS", "NUMEST"]),'LeftKeys',["Var1","Var2"],'RightKeys',["PERMNO","STATPERS"],'Type','left','LeftVariables',[],'RightVariables',3));



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%add standardized unexpected earnings SUE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% quarterly COMPUSTAT-file was downloaded from WRDS on May 2, 2025
% download from CRSP-COMPUSTAT-MERGED
% IDENTIFIER: LPERMNO
% LINKTYPES = LC or LU or LS
% no share code restrictions

table_COMPQ = readtable('COMPUSTAT_quarterly.csv');
table_COMPQ = sortrows(table_COMPQ,["LPERMNO","datadate"]);
table_COMPQ = table2array(table_COMPQ(:,["LPERMNO","rdq","datadate","ibq"])); %PERMNO, report date, fiscal end date, earnings excluding extraordinary items
table_COMPQ(:,3) = floor(table_COMPQ(:,3)/100);
table_COMPQ = [table_COMPQ, NaN(size(table_COMPQ,1),1)];
table_COMPQ = [NaN(4,5);table_COMPQ];
table_COMPQ = table_COMPQ(~isnan(table_COMPQ(:,4)),:);

%Calculate stand. unexpected earnings following Bali et al. (2014)
for j=9:size(table_COMPQ,1)
    temp = table_COMPQ(j-8:j,:);
    temp = temp(temp(:,3)>temp(9,3)-201 & temp(:,1)==temp(9,1),:);
    tempref = temp(temp(:,3)==temp(end,3)-100,4);
    if size(temp,1)>3 && ~isempty(tempref) 
        table_COMPQ(j,5) = (temp(end,4) - tempref(end,:)) ./ std(temp(:,4),"omitmissing");
    end
end

dataset.QRD = NaN(size(dataset.PERMNO,1),1);
dataset.SUE = NaN(size(dataset.PERMNO,1),1);
for j=1:length(dataset.date)
    % we use the last SUE from the previous year
    % earnings announcements on the last day of the formation month are not
    % considered in that month because information might have become public
    % after market close
    temp=table_COMPQ(table_COMPQ(:,1)==dataset.PERMNO(j,1) & table_COMPQ(:,2)<dataset.date(j,1) & table_COMPQ(:,2)>dataset.date(j,1)-10000,[2 5]); 
    if ~isempty(temp)
        dataset.QRD(j,1) = temp(end,1); %quarterly report date
        dataset.SUE(j,1) = temp(end,2);
    end
end



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%add different versions of CGO based on weekly data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%Aggregate to weekly data
days = datesun; %yyyymmdd
days(:,2)   = datenum(num2str(days(:,1)),'yyyymmdd'); %#ok<DATNM>
days(:,3:4) = NaN(size(days));
sundays = days(:,2)-weekday(days(:,2))+1; %previous sunday for each date
sundays = unique(sundays);
for j=2:length(days(:,3))
    days(j,  3) = sum(sundays<days(j,2)); %week number
    days(j-1,4) = days(j,3)>days(j-1,3);  %indicator: 1 if last day of week
end
days(1,3:4)=1;days(end,4)=1;
weeksun = days(days(:,4)==1,1);

% calculate weekly turnover and price matrices
tom_weekly          = NaN(length(weeksun),size(tom,2));
sadjprcm_weekly     = NaN(length(weeksun),size(tom,2));
for j=1:length(weeksun)
    sadjprcm_weekly(j,:)   = splitadjprcm(days(:,1)==weeksun(j,1),:); 
    tom_weekly(j,:)        = sum(tom(days(:,3)==j,:),1,"omitnan");
    tom_weekly(j,mean(isnan(tom(days(:,3)==j,:)),1)==1) = NaN;
end

tom_weekly(tom_weekly>1) = 1;%set larger values to 1 to calculate capital gains measure

cgo_sadjprc      = NaN(length(weeksun),size(tom,2));
pgain_sadjprc    = NaN(length(weeksun),size(tom,2));
cgo_sadjprc_med  = NaN(length(weeksun),size(tom,2));

lastweekofmonth = floor(weeksun(:,1)/100);
lastweekofmonth(:,2) = [lastweekofmonth(1:end-1,:) < lastweekofmonth(2:end,:) ; 1];

for j=261:length(weeksun)
if lastweekofmonth(j,2)==1

    tom_weeklyrel       = tom_weekly(j-260:j-1,:);
    sadjprcm_weeklyrel  = sadjprcm_weekly(j-260:j-1,:);
    
    %require prices and turnover in the previous 100 weeks
    badstocks = (mean(isnan(tom_weeklyrel(end-99:end,:)))~=0) | (mean(isnan(sadjprcm_weeklyrel(end-99:end,:)))~=0);
    badstockmonths = (isnan(tom_weeklyrel) | isnan(sadjprcm_weeklyrel));

    tom_weeklyrel      (badstockmonths) = 0; %no weight on observations without trading volume or price data
    sadjprcm_weeklyrel (badstockmonths) = 0; %no weight on observations without trading volume or price data
    
    weights=ones(260,size(tom,2));
    for n = 1:259
        weights(260-n,:) = weights(261-n,:) .* (1-tom_weeklyrel(261-n,:));
    end
    weights(:,badstocks)    = NaN;
    weights(badstockmonths) = 0;

    refprice_sadj      = sum(weights .* tom_weeklyrel .*  sadjprcm_weeklyrel ) ./ sum(weights .* tom_weeklyrel);
    cgo_sadjprc(j,:)   = (sadjprcm_weekly(j-1,:)  - refprice_sadj)  ./ refprice_sadj;
    pgain_sadjprc(j,:) = sum(weights .* tom_weeklyrel .*  (sadjprcm_weekly(j-1,:) - sadjprcm_weeklyrel > 0) ) ./ sum(weights .* tom_weeklyrel);


    %use median turnover pooled over 5 years to calculate CGO
    tom_weeklyrel = tom_weekly(j-260:j-1,:);
    tom_weeklyrel = median(tom_weeklyrel,"all","omitmissing");

    weights=ones(260,size(tom,2));
    for n = 1:259
        weights(260-n,:) = weights(261-n,:) .* (1-tom_weeklyrel);
    end
    weights(:,badstocks)    = NaN;
    weights(badstockmonths) = 0;
    
    refprice_sadj        = sum(weights .* tom_weeklyrel .*  sadjprcm_weeklyrel ) ./ sum(weights .* tom_weeklyrel);
    cgo_sadjprc_med(j,:) = (sadjprcm_weekly(j-1,:)  - refprice_sadj)  ./ refprice_sadj;

end
end

%only keep last observation of month 
%and transform matrices from wide to long format
a=repmat(transpose(lastweekofmonth(lastweekofmonth(:,2)==1,1)),length(permnoun),1);
b=repmat(permnoun,1,sum(lastweekofmonth(:,2)));
a=reshape(a,numel(a),1);
b=reshape(b,numel(b),1);
dataset.cgo_sadjprc       = array2table([b,a,reshape(transpose(cgo_sadjprc    (lastweekofmonth(:,2)==1,:)),numel(a),1)]);
dataset.pgain_sadjprc     = array2table([b,a,reshape(transpose(pgain_sadjprc  (lastweekofmonth(:,2)==1,:)),numel(a),1)]);
dataset.cgo_sadjprc_med   = array2table([b,a,reshape(transpose(cgo_sadjprc_med(lastweekofmonth(:,2)==1,:)),numel(a),1)]);

dataset.cgo_sadjprc       = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), dataset.cgo_sadjprc,     'LeftKeys',["Var1","Var2"],'RightKeys',["Var1","Var2"],'Type','left','LeftVariables',[],'RightVariables',3));
dataset.pgain_sadjprc     = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), dataset.pgain_sadjprc,   'LeftKeys',["Var1","Var2"],'RightKeys',["Var1","Var2"],'Type','left','LeftVariables',[],'RightVariables',3));
dataset.cgo_sadjprc_med   = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), dataset.cgo_sadjprc_med, 'LeftKeys',["Var1","Var2"],'RightKeys',["Var1","Var2"],'Type','left','LeftVariables',[],'RightVariables',3));


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%calculate CGO based on daily returns 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

tom_daily = tom; 
tom_daily(tom_daily>1) = 1;%set larger values to 1 in order to calculate capital gains measure

cgo_sadjprc       = NaN(size(tom_daily,1),size(tom_daily,2));

lastdayofmonth = floor(datesun(:,1)/100);
lastdayofmonth(:,2) = [lastdayofmonth(1:end-1,:) < lastdayofmonth(2:end,:) ; 1];

for j = 1261:size(tom_daily,1)
if lastdayofmonth(j,2)==1

    tom_dailyrel      = tom_daily(j-1260:j-1,:);
    sadjprcm_dailyrel = splitadjprcm(j-1260:j-1,:);

    %require prices and turnover in the previous 500 days
    badstocks = (mean(isnan(tom_dailyrel(end-500:end,:)))~=0) | (mean(isnan(sadjprcm_dailyrel(end-500:end,:)))~=0);
    badstockmonths = (isnan(tom_dailyrel) | isnan(sadjprcm_dailyrel));

    tom_dailyrel     (badstockmonths) = 0; %no weight on observations without trading volume or price data
    sadjprcm_dailyrel(badstockmonths) = 0; %no weight on observations without trading volume or price data
    
    weights=ones(1260,size(tom,2));
    for n = 1:1259
        weights(1260-n,:) = weights(1261-n,:) .* (1-tom_dailyrel(1261-n,:));
    end
    weights(:,badstocks)    = NaN;
    weights(badstockmonths) = 0;

    refprice_sadj  = sum(weights .* tom_dailyrel .*  sadjprcm_dailyrel ) ./ sum(weights .* tom_dailyrel);
    cgo_sadjprc(j,:)  = (splitadjprcm(j-1,:)  - refprice_sadj)  ./ refprice_sadj;

end
end

%only keep last observation of month 
%and transform matrices from wide to long format
a=repmat(transpose(lastdayofmonth(lastdayofmonth(:,2)==1,1)),length(permnoun),1);
b=repmat(permnoun,1,sum(lastdayofmonth(:,2)));
a=reshape(a,numel(a),1);
b=reshape(b,numel(b),1);
dataset.cgo_sadjprc_daily = array2table([b,a,reshape(transpose(cgo_sadjprc(lastdayofmonth(:,2)==1,:)),numel(a),1)]);
dataset.cgo_sadjprc_daily = table2array(outerjoin(array2table([dataset.PERMNO,dataset.month]), dataset.cgo_sadjprc_daily,      'LeftKeys',["Var1","Var2"],'RightKeys',["Var1","Var2"],'Type','left','LeftVariables',[],'RightVariables',3));



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%save dataset for further analyses
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
data_for_further_analyses = ...
    [dataset.PERMNO, ...
     dataset.month, ...
     dataset.date, ...
     dataset.MARKETCAP, ... 
     dataset.CEI, ...
     dataset.BETA, ...
     dataset.IVOL, ...
     dataset.SUE, ...
     dataset.QRD, ... 
     dataset.SHORTINT, ...
     dataset.TO, ...
     dataset.BM, ...
     dataset.OP, ...
     dataset.EP, ...
     dataset.DISP, ...
     dataset.nANALYST, ...
     dataset.VOL, ...
     dataset.RETm1mon, ...
     dataset.RETm2mon, ...
     dataset.RETm3mon, ...
     dataset.RETm1yr, ...
     dataset.RETm2yr, ...
     dataset.RETm3yr, ...
     dataset.RETm4yr, ...
     dataset.RETm5yr, ...
     dataset.cgo_sadjprc, ...
     dataset.pgain_sadjprc, ...
     dataset.cgo_sadjprc_med, ...
     dataset.cgo_sadjprc_daily];
dlmwrite('data_for_further_analyses.csv',data_for_further_analyses(data_for_further_analyses(:,2)>200000,:), 'delimiter', ',', 'precision', 16); %#ok<DLMWT>


